import json
import re
import time
from pymongo import MongoClient
import requests


class JiaYuanSpider(object):
    """Crawl jiayuan.com profile-search result pages and store each user
    profile as a document in a local MongoDB collection.

    NOTE(review): the hard-coded Cookie below is account-specific and
    expires; replace it with the cookie from your own browser session
    before running.
    """

    # The search endpoint wraps its JSON payload in '##{...}##' markers;
    # compile the extraction pattern once instead of on every page.
    _JSON_RE = re.compile(r'##(\{.*?\})##')

    def __init__(self):
        self.url_temp = 'https://search.jiayuan.com/v2/search_v2.php'
        # Request headers -- the Cookie value must be replaced with your own.
        self.headers = {
            'Cookie': 'guider_quick_search=on; accessID=202103150924595680; SESSION_HASH=1b565df188507bcdbdc052adb330652be6321e18; user_access=1; save_jy_login_name=15251693528; stadate1=272569046; myloc=32%7C3201; myage=21; mysex=m; myuid=272569046; myincome=50; COMMON_HASH=67e0cfc00edcb430489cc9483f1d0cd3; sl_jumper=%26cou%3D17%26omsg%3D0%26dia%3D0%26lst%3D2021-02-19; last_login_time=1615771547; user_attr=000000; pop_sj=0; PROFILE=273569046%3A%25E5%25B0%258F%25E9%25A9%25AC%25E5%2590%258C%25E5%25AD%25A6%3Am%3Aimages1.jyimg.com%2Fw4%2Fglobal%2Fi%3A0%3A%3A1%3Azwzp_m.jpg%3A1%3A1%3A50%3A10%3A3.0; pop_time=1615771579026; PHPSESSID=ae44e627844be5ef649bf6e96cc6962c; pop_avatar=1; main_search:273569046=%7C%7C%7C00; RAW_HASH=wh-OIJDeJy1X8NOMQ3aP1neiZp17TWqyx%2AyWF494yqKobfNsk8Xeysp0EBUwf6Sz6J1rmpU3wkD4PyqHj-YEgF2sPdVBm1SUFtIHk5FN1cXARdU.; is_searchv2=1',
            'Host': 'search.jiayuan.com',
            'Origin': 'https://search.jiayuan.com',
            'sec-ch-ua': '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin'
        }
        # POST form-data template for the search request; 'p' is the page
        # number and is overridden per request in parse().
        self.dat = {
            'sex': 'f',
            'key': '',
            'stc': '1:32,23:1',
            'sn': 'default',
            'sv': '1',
            'p': '1',
            'f': 'select',
            'listStyle': 'bigPhoto',
            'pri_uid': '273569046',
            'jsversion': 'v5',
        }
        # MongoDB on localhost with default port: database 'test',
        # collection 'jiayuan'.
        self.client = MongoClient()
        self.collection = self.client['test']['jiayuan']

    def parse(self, url, page_num):
        """Fetch one search-result page and return the decoded body.

        :param url: the search endpoint URL
        :param page_num: 1-based page number to request
        :return: response body decoded as text
        """
        # BUGFIX: the original aliased self.dat and wrote 'p' through to
        # the shared instance dict; copy the template instead so repeated
        # calls never mutate shared state.
        dat = dict(self.dat)
        dat['p'] = page_num
        time.sleep(1)  # be polite: throttle to roughly one request/second
        # A timeout prevents the crawl from hanging forever on a stalled
        # connection (the original had none).
        resp = requests.post(url, headers=self.headers, data=dat, timeout=10)
        return resp.content.decode()

    def get_content_list(self, str_html):
        """Extract every user profile from a page and persist each one.

        :param str_html: raw page body containing a '##{...}##' payload
        """
        json_html = self._JSON_RE.findall(str_html)[0]
        data = json.loads(json_html)
        for user in data['userInfo']:
            item = {
                'uid': user['uid'],
                'nickname': user['nickname'],
                'sex': user['sex'],
                'marriage': user['marriage'],
                'height': user['height'],
                'education': user['education'],
                'age': user['age'],
                'work_location': user['work_location'],
                'shortnote': user['shortnote'],
            }
            print(item)
            self.save(item)

    def save(self, item):
        """Insert one profile document into MongoDB."""
        # BUGFIX: Collection.insert() was deprecated and removed in
        # pymongo 4.x; insert_one() is the supported replacement.
        self.collection.insert_one(item)

    def run(self):
        """Entry point: discover the page count, then crawl every page."""
        # Fetch page 1 via parse() and reuse it -- the original issued a
        # duplicate request for page 1 (once here, once inside the loop).
        first_page = self.parse(self.url_temp, 1)
        payload = json.loads(self._JSON_RE.findall(first_page)[0])
        total_page = int(payload['pageTotal'])
        self.get_content_list(first_page)
        for page in range(2, total_page + 1):
            self.get_content_list(self.parse(self.url_temp, page))


if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    spider = JiaYuanSpider()
    spider.run()